{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0e4a9d73",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import pickle\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6364998e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 85977 entries, 0 to 85976\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  85977 non-null  object\n",
      " 1   Speaker_Name         85977 non-null  object\n",
      " 2   Text                 85977 non-null  object\n",
      " 3   Date                 85977 non-null  object\n",
      " 4   Legislative Body     85977 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 3.3+ MB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Speaker_Bioguide_ID</th>\n",
       "      <th>Speaker_Name</th>\n",
       "      <th>Text</th>\n",
       "      <th>Date</th>\n",
       "      <th>Legislative Body</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M001201</td>\n",
       "      <td>Mr. MITCHELL</td>\n",
       "      <td>Mr. Speaker, I rise today in the spirit of Mad...</td>\n",
       "      <td>2017-07-20</td>\n",
       "      <td>House</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B001250</td>\n",
       "      <td>Mr. BISHOP of Utah</td>\n",
       "      <td>Mr. Speaker, I ask unanimous consent that all ...</td>\n",
       "      <td>2017-07-20</td>\n",
       "      <td>House</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>B001250</td>\n",
       "      <td>Mr. BISHOP of Utah</td>\n",
       "      <td>Mr. Chair, I include in the Record my statemen...</td>\n",
       "      <td>2017-07-20</td>\n",
       "      <td>House</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B001250</td>\n",
       "      <td>Mr. BISHOP of Utah</td>\n",
       "      <td>Mr. Chair, I yield 5 minutes to the gentleman ...</td>\n",
       "      <td>2017-07-20</td>\n",
       "      <td>House</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Y000033</td>\n",
       "      <td>Mr. YOUNG of Alaska</td>\n",
       "      <td>Mr. Chairman, this is an issue that should hav...</td>\n",
       "      <td>2017-07-20</td>\n",
       "      <td>House</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Speaker_Bioguide_ID         Speaker_Name  \\\n",
       "0             M001201         Mr. MITCHELL   \n",
       "1             B001250   Mr. BISHOP of Utah   \n",
       "2             B001250   Mr. BISHOP of Utah   \n",
       "3             B001250   Mr. BISHOP of Utah   \n",
       "4             Y000033  Mr. YOUNG of Alaska   \n",
       "\n",
       "                                                Text        Date  \\\n",
       "0  Mr. Speaker, I rise today in the spirit of Mad...  2017-07-20   \n",
       "1  Mr. Speaker, I ask unanimous consent that all ...  2017-07-20   \n",
       "2  Mr. Chair, I include in the Record my statemen...  2017-07-20   \n",
       "3  Mr. Chair, I yield 5 minutes to the gentleman ...  2017-07-20   \n",
       "4  Mr. Chairman, this is an issue that should hav...  2017-07-20   \n",
       "\n",
       "  Legislative Body  \n",
       "0            House  \n",
       "1            House  \n",
       "2            House  \n",
       "3            House  \n",
       "4            House  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data = pd.read_csv('raw_original_data_floor_speeches_house.csv')\n",
    "print(raw_data.info())\n",
    "raw_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6bfcd430",
   "metadata": {},
   "outputs": [],
   "source": [
    "#1. we must have bio info on all the speakers\n",
    "legis_info = json.load(open('legislator-info-1990-2020.json'))\n",
    "legis_id_to_info = {}\n",
    "for x in legis_info:\n",
    "    legis_id_to_info[x['id']['bioguide']] = x\n",
    "del legis_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3990cb36",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n"
     ]
    }
   ],
   "source": [
    "speakers_to_remove_based_on_non_availbility_of_bio_info = set() \n",
    "speakers = list(raw_data['Speaker_Bioguide_ID'])\n",
    "for s in speakers:\n",
    "    if s not in legis_id_to_info:\n",
    "        speakers_to_remove_based_on_non_availbility_of_bio_info.add(s)\n",
    "print(len(speakers_to_remove_based_on_non_availbility_of_bio_info))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9ebdfbf8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 85733 entries, 0 to 85976\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  85733 non-null  object\n",
      " 1   Speaker_Name         85733 non-null  object\n",
      " 2   Text                 85733 non-null  object\n",
      " 3   Date                 85733 non-null  object\n",
      " 4   Legislative Body     85733 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 3.9+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data[~raw_data['Speaker_Bioguide_ID'].isin(speakers_to_remove_based_on_non_availbility_of_bio_info)]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1f3f0e0d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59\n"
     ]
    }
   ],
   "source": [
    "#remove speakers if they gave less than 25 speeches - TBIP paper removed senators with less than 24 speeches.\n",
    "speakers_to_remove_based_on_num_speeches = set()\n",
    "speakers = set(raw_data['Speaker_Bioguide_ID'])\n",
    "thresh = 25\n",
    "for s in speakers:\n",
    "    n_s = len(raw_data[raw_data['Speaker_Bioguide_ID']==s])\n",
    "    if n_s < thresh:\n",
    "        speakers_to_remove_based_on_num_speeches.add(s)\n",
    "print(len(speakers_to_remove_based_on_num_speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "33632a0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 85173 entries, 0 to 85976\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  85173 non-null  object\n",
      " 1   Speaker_Name         85173 non-null  object\n",
      " 2   Text                 85173 non-null  object\n",
      " 3   Date                 85173 non-null  object\n",
      " 4   Legislative Body     85173 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 3.9+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data[~raw_data['Speaker_Bioguide_ID'].isin(speakers_to_remove_based_on_num_speeches)]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8988abcb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stopwords = set()\n",
    "# names_in_cong_record = list(raw_data['Speaker_Name'])\n",
    "# for n in names_in_cong_record:\n",
    "#     l = n.split()\n",
    "#     for x in l:\n",
    "#         stopwords.add(x.lower())\n",
    "# bioguide_ids = set(raw_data['Speaker_Bioguide_ID'])\n",
    "# for bid in bioguide_ids:\n",
    "#     name = list(legis_id_to_info[bid]['name'].values())\n",
    "#     for x in name:\n",
    "#         for z in x.split():\n",
    "#             stopwords.add(z.lower().replace('\"', '').replace(\"'\", ''))\n",
    "            \n",
    "# #also add in the stopwords list used by TBIP paper authors to preprocess senate speeches data - it consists\n",
    "# #all state names, cities, month names, days of week, and other stopwords/procedural terms - very useful. \n",
    "# stopwords_from_senate_speeches_tbip = open('../../setup/stopwords/senate_speeches.txt').readlines()\n",
    "# stopwords_from_senate_speeches_tbip = list(map(lambda x:x.rstrip(), stopwords_from_senate_speeches_tbip))\n",
    "\n",
    "# stopwords = stopwords.union(set(stopwords_from_senate_speeches_tbip))\n",
    "# f = open('stopwords.txt', 'w')\n",
    "# for i, x in enumerate(list(stopwords)):\n",
    "#     f.write(x)\n",
    "#     if i < len(stopwords) - 1:\n",
    "#         f.write('\\n')\n",
    "# f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "92b5292e",
   "metadata": {},
   "outputs": [],
   "source": [
    "stopwords = open('stopwords.txt', 'r').readlines()\n",
    "stopwords = list(map(lambda x:x.rstrip(), stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "396bfa30",
   "metadata": {},
   "outputs": [],
   "source": [
    "#while it is possible to add more jargon terms perhaps, do not want to overdo stopwords, because words can be \n",
    "#highly contextual and have meaning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "174ce607",
   "metadata": {},
   "outputs": [],
   "source": [
    "#rest of the preprocessing is following the script provided in the TBIP repo by Vafa et al. - setup/senate_speeches_to_bag_of_words.py \n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import sparse\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "053b0670",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85173\n",
      "85173\n"
     ]
    }
   ],
   "source": [
    "speakers = list(raw_data['Speaker_Bioguide_ID'])\n",
    "print(len(speakers))\n",
    "speeches = list(raw_data['Text'])\n",
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2b873edd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85173\n",
      "513\n"
     ]
    }
   ],
   "source": [
    "speaker_to_speaker_id = dict(\n",
    "    [(y, x) for x, y in enumerate(sorted(set(speakers)))])\n",
    "author_indices = np.array(\n",
    "    [speaker_to_speaker_id[s] for s in speakers])\n",
    "print(len(author_indices))\n",
    "author_map = np.array(list(speaker_to_speaker_id.keys()))\n",
    "print(len(author_map))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "76fd924f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85173\n"
     ]
    }
   ],
   "source": [
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c6f12345",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/pranavgoel/miniconda3/envs/pg3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:409: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['allard', 'andr', 'balart', 'barrag', 'col', 'colon', 'cortez', 'garc', 'gonz', 'guti', 'halleran', 'jes', 'jos', 'jr', 'lehtinen', 'lez', 'luj', 'mucarsel', 'nchez', 'ocasio', 'powell', 'ra', 'rdenas', 'ros', 'rourke', 'roybal', 'rrez', 'shea', 'sr', 'vel', 'wm', 'zquez'] not in stop_words.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "count_vectorizer = CountVectorizer(min_df=0.001,\n",
    "                                   max_df=0.75, \n",
    "                                   stop_words=stopwords, \n",
    "                                   ngram_range=(1, 3),\n",
    "                                   token_pattern=\"[a-zA-Z]+\")\n",
    "# Learn initial document term matrix. This is only initial because we use it to\n",
    "# identify words to exclude based on author counts.\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "vocabulary = np.array(\n",
    "    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), \n",
    "                            key=lambda kv: kv[1])])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e7cc8469",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(85173, 10588)\n",
      "10588\n"
     ]
    }
   ],
   "source": [
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d5178de2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 513/513 [00:05<00:00, 94.01it/s]\n"
     ]
    }
   ],
   "source": [
    "author_to_inds = {}\n",
    "for a in tqdm(list(author_map)):\n",
    "    inds = []\n",
    "    author_ind = speaker_to_speaker_id[a]\n",
    "    for i, ind in enumerate(list(author_indices)):\n",
    "        if ind==author_ind:\n",
    "            inds.append(i)\n",
    "    author_to_inds[a] = inds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "da8ef8b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_per_author_counts(counts, author_to_inds):\n",
    "    list_of_arrays = []\n",
    "    for a in author_to_inds:\n",
    "        inds = author_to_inds[a]\n",
    "        list_of_arrays.append(np.array(np.sum(counts[inds], 0)))#.reshape((1, counts.shape[1])))\n",
    "    return np.concatenate(list_of_arrays, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "cd824625",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(513, 10588)\n"
     ]
    }
   ],
   "source": [
    "# Remove phrases spoken by less than 50 representatives\n",
    "min_authors_per_word = 50\n",
    "counts_per_author = get_per_author_counts(counts, author_to_inds)\n",
    "print(counts_per_author.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "356f308a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10151\n"
     ]
    }
   ],
   "source": [
    "acceptable_words = []\n",
    "for i in range(len(vocabulary)):\n",
    "    if np.count_nonzero(counts_per_author[:, i]) >= min_authors_per_word:\n",
    "        acceptable_words.append(i)\n",
    "print(len(acceptable_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "fc115984",
   "metadata": {},
   "outputs": [],
   "source": [
    "count_vectorizer = CountVectorizer(ngram_range=(1, 3),\n",
    "                                   vocabulary=vocabulary[acceptable_words])\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "vocabulary = np.array(\n",
    "    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), \n",
    "                            key=lambda kv: kv[1])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "c20f97af",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(85173, 10151)\n",
      "10151\n"
     ]
    }
   ],
   "source": [
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6d7b3a2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `n_gram_to_unigram` takes as key an index to an n-gram in the vocabulary\n",
    "# and its value is a list of the vocabulary indices of the corresponding \n",
    "# unigrams.\n",
    "n_gram_indices = np.where(\n",
    "  np.array([len(word.split(' ')) for word in vocabulary]) > 1)[0]\n",
    "n_gram_to_unigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    matching_unigrams = []\n",
    "    for unigram in vocabulary[n_gram_index].split(' '):\n",
    "        if unigram in vocabulary:\n",
    "            matching_unigrams.append(np.where(vocabulary == unigram)[0][0])\n",
    "    n_gram_to_unigrams[n_gram_index] = matching_unigrams\n",
    "\n",
    "# `n_grams_to_bigrams` now breaks apart trigrams and higher to find bigrams \n",
    "# as subsets of these words.\n",
    "n_grams_to_bigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    split_n_gram = vocabulary[n_gram_index].split(' ')\n",
    "    n_gram_length = len(split_n_gram) \n",
    "    if n_gram_length > 2:\n",
    "        bigram_matches = []\n",
    "        for i in range(0, n_gram_length - 1):\n",
    "            bigram = \" \".join(split_n_gram[i:(i + 2)])\n",
    "            if bigram in vocabulary:\n",
    "                bigram_matches.append(np.where(vocabulary == bigram)[0][0])\n",
    "        n_grams_to_bigrams[n_gram_index] = bigram_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6a40b1ab",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 85173/85173 [01:33<00:00, 907.58it/s] \n"
     ]
    }
   ],
   "source": [
    "# Go through counts, and remove a unigram each time a bigram superset \n",
    "# appears. Also remove a bigram each time a trigram superset appears.\n",
    "# Note this isn't perfect: if bigrams overlap (e.g. \"global health care\" \n",
    "# contains \"global health\" and \"health care\"), we count them both. This\n",
    "# may introduce a problem where we subract a unigram count twice, so we also\n",
    "# ensure non-negativity.\n",
    "#counts_dense = counts.toarray()\n",
    "for i in tqdm(range(counts.shape[0])):\n",
    "    n_grams_in_doc = np.where(counts[i, n_gram_indices].toarray() > 0)[0]\n",
    "    sub_n_grams = n_gram_indices[n_grams_in_doc]\n",
    "    for n_gram in sub_n_grams:\n",
    "        counts[i, n_gram_to_unigrams[n_gram]] = sparse.csr_matrix(counts[i, n_gram_to_unigrams[n_gram]].toarray() - counts[i, n_gram])\n",
    "        if n_gram in n_grams_to_bigrams:\n",
    "            counts[i, n_grams_to_bigrams[n_gram]] = sparse.csr_matrix(counts[i, n_grams_to_bigrams[n_gram]].toarray() - counts[i, n_gram])\n",
    "counts[counts < 0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "f437ac32",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(85173, 10151)\n"
     ]
    }
   ],
   "source": [
    "print(counts.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bd1a3754",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 85173/85173 [00:04<00:00, 18334.22it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(84063, 10151)\n",
      "(84063,)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Remove speeches with no words.\n",
    "existing_speeches = []#np.where(np.sum(counts_dense, axis=1) > 0)[0]\n",
    "for i in tqdm(range(counts.shape[0])):\n",
    "    if counts[i].sum() > 0:\n",
    "        existing_speeches.append(i)\n",
    "counts = counts[existing_speeches]\n",
    "print(counts.shape)\n",
    "author_indices = author_indices[existing_speeches]\n",
    "print(author_indices.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "7adda18b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save data.\n",
    "\n",
    "# `counts.npz` is a [num_documents, num_words] sparse matrix containing the\n",
    "# word counts for each document.\n",
    "sparse.save_npz(\"clean/counts.npz\",\n",
    "                counts.astype(np.float32))\n",
    "\n",
    "# `author_indices.npy` is a [num_documents] vector where each entry is an\n",
    "# integer indicating the author of the corresponding document.\n",
    "np.save(\"clean/author_indices.npy\", author_indices)\n",
    "\n",
    "# `vocabulary.txt` is a [num_words] vector where each entry is a string\n",
    "# denoting the corresponding word in the vocabulary.\n",
    "np.savetxt(\"clean/vocabulary.txt\", vocabulary, fmt=\"%s\")\n",
    "\n",
    "# `author_map.txt` is a [num_authors] vector of strings providing the bioguide ID of\n",
    "# each author in the corpus.\n",
    "np.savetxt(\"clean/author_map.txt\", author_map, fmt=\"%s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "ede09272",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `raw_documents.txt` contains all the documents we ended up using.\n",
    "raw_documents = [document.replace(\"\\n\", ' ').replace(\"\\r\", ' ') \n",
    "                 for i, document in enumerate(speeches) if i in existing_speeches]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "488ba607",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84063\n"
     ]
    }
   ],
   "source": [
    "print(len(raw_documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "2629ac9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open('clean/raw_documents.txt', 'w')\n",
    "for i, doc in enumerate(raw_documents):\n",
    "    f.write(doc)\n",
    "    if i < len(raw_documents) - 1:\n",
    "        f.write('\\n')\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6576581a-db21-4438-8d00-6b5008979b72",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "894bd1de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "84063"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(existing_speeches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "41a54f8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 84063 entries, 0 to 85975\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  84063 non-null  object\n",
      " 1   Speaker_Name         84063 non-null  object\n",
      " 2   Text                 84063 non-null  object\n",
      " 3   Date                 84063 non-null  object\n",
      " 4   Legislative Body     84063 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 3.8+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data.iloc[existing_speeches]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "a18bc991",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data.to_csv('finalized_tbip_speech_set_raw_original_data_floor_speeches_house.csv', index=False)\n",
    "#save this if needed. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e8acf0f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
